/* b_SASID_master_gen.do -*****************************************************

	This file creates the list of SASIDs and studentno, while ensuring that
	for each studentno we keep only the SASID associated with the earliest MCAS
	test score.

******************************************************************************/
* Start from an empty dataset and turn off output paging.
clear
set more off

* For each SIMS collection (oct and eoy) and each year, match SIMS students to
* the NSC name file on first initial, last initial and date of birth, keep the
* pairs whose names are close enough, reduce to one row per studentno and one
* row per sasid, attach MCAS information, and save one crosswalk file per
* collection and year.
foreach t in oct eoy {
	local fyr 2001
	local lyr 2017

	*Eoy indicates the end of the school year so we need to push the year
	*forward one
	if "`t'"=="eoy" {
		local fyr 2002
		local lyr 2018
	}

	forvalues y = `fyr'/`lyr' {
		*The SIMS file names carry only the last two digits of the year
		local yyo = substr("`y'",-2,2)
		use "$raw_data_sims/sims`t'`yyo'.dta", clear

		*Clean names and dates of birth
		foreach x in fname mname lname {
			replace `x' = lower(`x')
		}
		*the october 2010 and 2011 files use a different date convention.
		*The third argument of date() is the top year used to resolve
		*two-digit years.
		if inlist(`y',2010,2011) & "`t'"=="oct" {
			gen DOB = date(dob,"DMY",2050)
		}
		else {
			gen DOB = date(dob,"MDY")
		}
		format DOB %td

		*Swap the string dob for the numeric Stata date
		rename dob olddob
		rename DOB dob
		*Initials used as join keys and for the middle initial
		gen fl = substr(fname,1,1)
		gen ll = substr(lname,1,1)
		gen mi = substr(mname,1,1)


		*Keep match variables
		keep sasid fname lname mname fl ll mi dob attend

		*Specify that the name is coming from SIMS data set
		foreach x in fname mname lname mi {
			rename `x' s`x'
		}

		*Merge with the primary name data
		*joinby forms every pairwise combination within fl, ll and dob, so
		*each SIMS record is paired with every candidate in the using file
		joinby fl ll dob using "$stata_data_analysis/nsc/nsc_ready_fin.dta"

		*Account for some years having name variables
		cap drop name
		gen name = fname+" "+lname
		cap drop sname
		gen sname = sfname+" "+slname
		replace name = lower(name)


		*Calculate string distance
		strdist name sname, gen(dist)
		cap drop len
		gen len = length(name)
		cap drop slen
		gen slen = length(sname)
		*Normalise the edit distance by the combined length of both names
		cap drop ldist
		gen ldist = dist/(len+slen)
		gen lnamelen = length(lname)
		gen slnamelen = length(slname)

		* A lot of Hispanic names omit a second portion in BPS.
		* Flag pairs with the same first name where one last name is the
		* leading part of the other, in either direction. The second
		* statement only switches the flag on, so it cannot undo the first
		* test. The using-file names are lowercased here because the SIMS
		* names were lowercased above and the string distance is likewise
		* computed on a lowercased name.
		gen flag = lower(fname)==sfname & lower(lname)==substr(slname,1,lnamelen)
		replace flag = 1 if lower(fname)==sfname & substr(lower(lname),1,slnamelen)==slname

		* Flag a name if it satisfies the string distance requirement
		gen found_sim_dist = ldist<.2 | flag==1
		* found_sim_exact is not used by the filter below and is dropped by
		* the later keep
		gen found_sim_exact = sname == name | flag==1

		*Only keep if the names match enough
		keep if found_sim_dist == 1

		*For each studentno pick the closest match
		*Smallest ldist first, ties broken by the larger attend value
		destring attend, replace
		* NOTE(review): 555 looks like a sentinel code in attend - confirm
		replace attend = 0 if attend == 555
		gsort studentno ldist -attend

		by studentno: keep if _n==1

		*Likewise keep a single studentno for each sasid
		gsort sasid ldist year -attend
		by sasid: keep if _n==1
		*With force, sasid values that are not numeric become missing
		destring sasid, gen(sasid_no) force

		*Merge in test data to see if an MCAS score exists
		*Rows that appear only in the test file are discarded
		merge 1:1 sasid_no using "$stata_data/test_wide.dta", keep(master match)

		*Keep relevant variables
		keep sasid sasid_no studentno earliest_mcas ldist yr_asgn attend

		compress
		save "$stata_data_crosswalks/SASID/`y'/sasid`t'.dta", replace

	}
}


* Stack every saved crosswalk file, one per collection and year, into a
* single dataset and record the year of the source file in simyear.
clear
foreach t in oct eoy {
	* The eoy files run one year later than the oct files
	local shift = cond("`t'"=="eoy", 1, 0)
	local fyr = 2001 + `shift'
	local lyr = 2017 + `shift'

	forvalues y = `fyr'/`lyr' {
		append using "$stata_data_crosswalks/SASID/`y'/sasid`t'.dta"

		* Create simyear on the first pass, afterwards fill only the rows
		* that do not have a value yet
		cap confirm variable simyear, exact
		if _rc {
			gen simyear = `y'
		}
		else {
			replace simyear = `y' if missing(simyear)
		}
	}
}

*Keep the observation with the earliest MCAS

* One row per studentno: lowest earliest_mcas first, then lowest ldist, then
* lowest simyear, then highest attend.
gsort studentno earliest_mcas ldist simyear -attend
by studentno: drop if _n > 1

* Same ordering applied within sasid, so each sasid also appears only once.
gsort sasid earliest_mcas ldist simyear -attend
by sasid: drop if _n > 1

* Reduce to the crosswalk variables and write the final file.
keep studentno sasid sasid_no yr_asgn

compress
save "$stata_data_crosswalks/SASID_Studentno.dta", replace
